# -*- coding: utf-8 -*-
"""
Created on Wed Nov 10 14:31:49 2021

@author: perlita
@title: project 5
"""

import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets
import sklearn.model_selection

# Question 2

# Implementing logistic regression using breast cancer dataset

# Load scikit-learn's breast cancer data set: X holds the features, y the
# 0/1 class labels.
dataset = sk.datasets.load_breast_cancer()
X = dataset.data
y = dataset.target

# Keep 80% of the samples for training; the remainder is the validation set.
X_train, X_val, y_train, y_val = \
    sk.model_selection.train_test_split(X, y, train_size = .8)

# Standardize every feature with statistics taken from the training split
# only, and apply the same shift/scale to the validation split.
mu = np.mean(X_train, axis = 0)
s = np.std(X_train, axis = 0)
# A feature that is constant over the training split has zero spread; divide
# by 1 instead so that column becomes 0 rather than NaN/inf.
s = np.where(s == 0, 1.0, s)
X_train = (X_train - mu)/s
X_val = (X_val - mu)/s

# Prepend a column of ones so that beta[0] plays the role of the intercept.
X_train = np.insert(X_train, 0, 1, axis = 1)
X_val = np.insert(X_val, 0, 1, axis = 1)

def sigmoid(u):
    """Return the logistic function 1/(1 + exp(-u)), element-wise.

    Parameters
    ----------
    u : scalar or array-like
        Input value(s).

    Returns
    -------
    Value(s) in [0, 1] with the same shape as ``u``.

    Notes
    -----
    Only ``exp(-|u|)`` is ever evaluated, which lies in (0, 1]. Evaluating
    ``exp(u)`` directly overflows to ``inf`` for large positive ``u`` and
    turns the ratio into ``nan``.
    """
    u = np.asarray(u)
    decay = np.exp(-np.abs(u))
    # u >= 0: 1/(1 + e^-u);  u < 0: e^u/(1 + e^u).  Both share a denominator.
    return np.where(u >= 0, 1.0, decay)/(1.0 + decay)

def cross_entropy(p, q):
    """Return the binary cross-entropy between label p and probability q.

    Computes ``-p*log(q) - (1 - p)*log(1 - q)`` element-wise.

    Parameters
    ----------
    p : scalar or numpy array
        True label(s), 0 or 1.
    q : scalar or numpy array
        Predicted probability(ies) of the label being 1.

    Returns
    -------
    The non-negative, finite loss value(s).

    Notes
    -----
    ``q`` is clipped to ``[eps, 1 - eps]`` (machine epsilon) before taking
    logs. Without the clip a saturated prediction (exactly 0 or 1) gives
    ``log(0) = -inf``, and ``0 * -inf`` then poisons the loss with ``nan``.
    """
    eps = np.finfo(float).eps
    q = np.clip(q, eps, 1.0 - eps)
    # log1p(-q) is log(1 - q), evaluated accurately when q is tiny.
    return -p*np.log(q) - (1-p)*np.log1p(-q)

def eval_L(beta, X, y):
    """Return the logistic-regression loss summed over all samples.

    Parameters
    ----------
    beta : array of shape (d+1,)
        Coefficient vector; pairs with the columns of ``X``.
    X : array of shape (N, d+1)
        One sample per row.
    y : array of shape (N,)
        0/1 label of each sample.

    Returns
    -------
    The sum (not the mean) over samples of the binary cross-entropy between
    ``y[i]`` and ``sigmoid(X[i] . beta)``.

    Notes
    -----
    With z = X @ beta, the per-sample cross-entropy simplifies to
    ``log(1 + exp(z)) - y*z``. Evaluating that through ``np.logaddexp``
    handles all rows at once and stays finite for large ``|z|``, where
    forming the probability first would overflow or hit ``log(0)``.
    """
    z = np.asarray(X) @ np.asarray(beta)
    return np.sum(np.logaddexp(0.0, z) - np.asarray(y)*z)

def grad_L(beta, X, y):
    """Return the gradient of the logistic loss, averaged over the samples.

    Parameters
    ----------
    beta : array of shape (d+1,)
        Coefficient vector; pairs with the columns of ``X``.
    X : array of shape (N, d+1)
        One sample per row.
    y : array of shape (N,)
        0/1 label of each sample.

    Returns
    -------
    Array of shape (d+1,): ``(1/N) * sum_i (q_i - y_i) * X[i]`` where
    ``q_i = sigmoid(X[i] . beta)``.

    Raises
    ------
    ZeroDivisionError
        If ``X`` has no rows.
    """
    X = np.asarray(X)
    N = X.shape[0]
    z = X @ np.asarray(beta)
    # sigmoid(z) written as exp(z - log(1 + e^z)): the exponent is never
    # positive, so large |z| cannot overflow.
    q = np.exp(z - np.logaddexp(0.0, z))
    # One matrix product replaces the per-row accumulation.
    return (X.T @ (q - np.asarray(y)))*(1/N)

#%%

# Question 2 continued

# Fixed-step gradient descent on the training loss, starting from beta = 0.
max_iter = 10000
t = 0.1
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals = []

for k in range(max_iter):

    # Record the loss at the current iterate, then take one step.
    L_vals.append(eval_L(betak, X_train, y_train))
    grad = grad_L(betak, X_train, y_train)
    betak = betak - t*grad

# Open a fresh figure so that, when the file runs as a plain script, this
# curve does not share axes with any other plot.
plt.figure()
plt.semilogy(L_vals)
plt.title("Object Function Value vs Iteration k for Breast Cancer Dataset")

# Predicted probability of class 1 for every validation sample, thresholded
# at 0.5 to get a hard 0/1 prediction.
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose prediction matches the label.
accuracy = np.mean(augmented_predictions == y_val)

print('Iterations of gradient descent used: ', max_iter)
print('Best gradient descent step size I could use: ', t)
# accuracy is a fraction in [0, 1]; scale it so the value matches the label.
print("Accuracy percentage: ", 100*accuracy)

#%%

# Question 3

# Implementing logistic regression on MNIST (digit 0 vs. every other digit)

# Fetch MNIST from OpenML as plain arrays (cached under data_home).
MNIST_dataset = \
    sk.datasets.fetch_openml('mnist_784', as_frame = False,
                             data_home =\
            '/Users/perli/OneDrive/Documents/MATH 375/projects/project05')
X = MNIST_dataset.data
labels = MNIST_dataset.target
N = len(labels)
# Binary target: 0 for the digit '0', 1 for every other digit. The labels
# are compared as strings, hence '0' rather than 0.
y = np.where(np.asarray(labels) != '0', 1.0, 0.0)

#Splitting the data: the first N_train rows train, the rest validate
N_train = 5000
X_train = X[0:N_train]
y_train = y[0:N_train]

X_val = X[N_train:]
y_val = y[N_train:]

# Rescale the pixel values by 1/255
X_train = X_train/255.0
X_val = X_val/255.0

# Augment the data with a leading column of ones (intercept term)
X_train = np.insert(X_train, 0, 1, axis = 1)
X_val = np.insert(X_val, 0, 1, axis = 1)

# Fixed-step gradient descent on the training loss, starting from beta = 0.
max_iter = 700
t = 0.1
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals = []

for k in range(max_iter):

    # Record the loss at the current iterate, then take one step.
    L_vals.append(eval_L(betak, X_train, y_train))
    grad = grad_L(betak, X_train, y_train)
    betak = betak - t*grad

# Open a fresh figure so that, when the file runs as a plain script, this
# curve is not drawn on top of an earlier plot.
plt.figure()
plt.semilogy(L_vals)
plt.title('Object Function Value vs Iteration k for MNIST Dataset')

#%%

# Question 3 continued

# Predicted probability of class 1 (i.e. "not the digit 0") per validation row.
predictions = sigmoid(X_val @ betak)
# Validation indices ordered from highest to lowest predicted probability.
sorted_idxs = np.argsort(predictions)
sorted_idxs = np.flip(sorted_idxs)
# Hard 0/1 prediction: 1 unless the probability is below 0.5.
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose prediction matches the label.
accuracy = np.mean(augmented_predictions == y_val)

print('Iterations of gradient descent used: ', max_iter)
print('Best gradient descent step size I could use: ', t)
# accuracy is a fraction in [0, 1]; scale it so the value matches the label.
print("Accuracy percentage: ", 100*accuracy)

#%%

# Question 3 continued

# Walk the validation samples from highest to lowest predicted probability
# and keep the first five whose hard prediction disagrees with the label.
most_confusing = []
for i in sorted_idxs:

    if augmented_predictions[i] != y_val[i]:
        most_confusing.append(i)

    if len(most_confusing) == 5:
        break

print('Top Five Most Confusing Images: ')
num_row = 2
num_col = 3
fig, axes = plt.subplots(num_row, num_col, figsize=(1.5*num_col,2*num_row))
for slot, i in enumerate(most_confusing):
    # i indexes the validation split, which begins at row N_train of X and
    # labels; shift it so the image and label shown belong to the sample
    # that was actually misclassified.
    dataset_idx = N_train + i
    ax = axes[slot//num_col, slot%num_col]
    ax.imshow(X[dataset_idx].reshape(28, 28), cmap='gray_r')
    ax.set_title('Label: {}'.format(labels[dataset_idx]))

    print(slot + 1, ') Label: ', labels[dataset_idx])
    # Reported index is the row of X / labels that is displayed.
    print('Index value: ', dataset_idx)
    print('')
# Blank out any grid cell that did not receive an image.
for ax in axes.ravel()[len(most_confusing):]:
    ax.axis('off')
plt.tight_layout()
plt.show()

# Response: I believe the reason behind the confusion on these images 
#   probably has to do with the curves on these numbers since that seems to 
#   be the only similarity shared with all these images. I also have a 
#   feeling that reading curves is a lot more difficult for a computer/
#   machine than reading straight lines or curves that match a recognizable
#   mathematical equation.


